Thanks to Thomas Mock, Customer Enablement Lead at RStudio.
https://www.youtube.com/watch?v=MKwyauo8nSI&ab_channel=ThomasMock
Some parts of my presentation are partially inspired from his.
Thanks to Thomas Mock, Customer Enablement Lead at RStudio.
https://www.youtube.com/watch?v=MKwyauo8nSI&ab_channel=ThomasMock
Some parts of my presentation are partially inspired from his.
This presentation is available at https://www.justinsavoie.com/ssmw2022
Error messages are usually informative.
You can also google the error message.
Assignment
x <- 2.5
y <- 10
z <- x*y z
## [1] 25
zz <- z + 2 *x zz
## [1] 30
x1 <- c(2,3,4,10,12,46) x2 <- c(-1,3,4,10,5,3) x1+x2
## [1] 1 6 8 20 17 49
x1 <- c(2,3,4,10,12,46) x2 <- c(-1,3,4,10,5,3,1,1,1.5) x1+x2
## Warning in x1 + x2: longer object length is not a multiple of shorter object ## length
## [1] 1.0 6.0 8.0 20.0 17.0 49.0 3.0 4.0 5.5
my_dataframe <- data.frame(x=c(1,2,3),y=c(3.5,4.5,5.5)) my_dataframe
## x y ## 1 1 3.5 ## 2 2 4.5 ## 3 3 5.5
head(mtcars)
## mpg cyl disp hp drat wt qsec vs am gear carb ## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 ## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 ## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 ## Hornet 4 Drive 21.4 6 258 110 3.08 3.215 19.44 1 0 3 1 ## Hornet Sportabout 18.7 8 360 175 3.15 3.440 17.02 0 0 3 2 ## Valiant 18.1 6 225 105 2.76 3.460 20.22 1 0 3 1
mean(c(2,3,4))
## [1] 3
random_vector<- rnorm(n=100,mean=0,sd=1) mean(random_vector)
## [1] 0.1199965
sd(random_vector)
## [1] 1.043842
IQR(random_vector)
## [1] 1.424318
summary(random_vector)
## Min. 1st Qu. Median Mean 3rd Qu. Max. ## -2.1130 -0.5941 0.1796 0.1200 0.8302 2.8603
set.seed(232) random_vector <- rnorm(n=5,mean=0,sd=1) set.seed(232) random_vector <- rnorm(5,0,1) mean(x=random_vector)
## [1] 0.233712
mean(random_vector)
## [1] 0.233712
mtcars$cyl[1:5]
## [1] 6 6 4 6 8
my_quadratic_function <- function(x){
return(x^2+6*x+14.5)
}
my_quadratic_function(mtcars$cyl[1:5])
## [1] 86.5 86.5 54.5 86.5 126.5
mtcars$cylQUADRATIC <- my_quadratic_function(mtcars$cyl) head(mtcars,3)
## mpg cyl disp hp drat wt qsec vs am gear carb cylQUADRATIC ## Mazda RX4 21.0 6 160 110 3.90 2.620 16.46 0 1 4 4 86.5 ## Mazda RX4 Wag 21.0 6 160 110 3.90 2.875 17.02 0 1 4 4 86.5 ## Datsun 710 22.8 4 108 93 3.85 2.320 18.61 1 1 4 1 54.5
seq(from=1,to=3.5,by=0.5)
## [1] 1.0 1.5 2.0 2.5 3.0 3.5
seq(from=1,to=3.5,by=0.51)
## [1] 1.00 1.51 2.02 2.53 3.04
seq(from=-2,to=2,length.out=7)
## [1] -2.0000000 -1.3333333 -0.6666667 0.0000000 0.6666667 1.3333333 2.0000000
table(mtcars$cyl)
## ## 4 6 8 ## 11 7 14
ifelse(c(3,3,6,8)>5,1,0)
## [1] 0 0 1 1
plot(mtcars$mpg,mtcars$hp)
x <- c(1,2,3) x[2]
## [1] 2
x[2:3]
## [1] 2 3
mtcars$mpg[c(1,3)]
## [1] 21.0 22.8
mtcars[1:3,4:5]
## hp drat ## Mazda RX4 110 3.90 ## Mazda RX4 Wag 110 3.90 ## Datsun 710 93 3.85
x[3] <- NA mtcars$mpg[mtcars$mpg>20]
## [1] 21.0 21.0 22.8 21.4 24.4 22.8 32.4 30.4 33.9 21.5 27.3 26.0 30.4 21.4
x <- mtcars$mpg x[x>20] <- 1000
x <- mtcars$mpg x[x>20] <- 1000 mtcars$mpg
## [1] 21.0 21.0 22.8 21.4 18.7 18.1 14.3 24.4 22.8 19.2 17.8 16.4 17.3 15.2 10.4 ## [16] 10.4 14.7 32.4 30.4 33.9 21.5 15.5 15.2 13.3 19.2 27.3 26.0 30.4 15.8 19.7 ## [31] 15.0 21.4
x
## [1] 1000.0 1000.0 1000.0 1000.0 18.7 18.1 14.3 1000.0 1000.0 19.2 ## [11] 17.8 16.4 17.3 15.2 10.4 10.4 14.7 1000.0 1000.0 1000.0 ## [21] 1000.0 15.5 15.2 13.3 19.2 1000.0 1000.0 1000.0 15.8 19.7 ## [31] 15.0 1000.0
my_data <- read_delim("/Users/justinsavoie/Downloads/
2019 Canadian Election Study - Phone Survey v1.0.tab")
setwd("~/Dropbox (Personal)/UofT/thisprojectimworkingon/")
my_data <- read_delim("/data/
2019 Canadian Election Study - Phone Survey v1.0.tab")
did_something <- do_something(data) did_another_thing <- do_another_thing(did_something) do_last_thing <- do_last_thing(did_another_thing)
final_thing <- do_last_thing(
do_another_thing(
do_something(data)
)
)
final_thing <- data %>% do_something() %>% do_another_thing() %>% do_last_thing()
mean(c(1,2,3));
## [1] 2
c(1,2,3) %>% mean()
## [1] 2
xplus6 <- function(x) x+6 xminus2 <- function(x) x+2 xtotwothird <- function(x) x^(2/3) my_vector <- c(3,4,2) my_vector %>% xplus6() %>% xminus2() %>% xtotwothird() %>% mean()
## [1] 4.943053
mean(xtotwothird(xminus2(xplus6(my_vector))))
## [1] 4.943053
do_something(data) is equivalent to:
ces <- read.csv("https://www.justinsavoie.com/data/dataces1.txt")
head(ces)
## q2_birthyear q3_gender q4_province q6_satisfied_democracy ## 1 1963 (1) Male (5) Quebec (3) Not very satisfied ## 2 1973 (1) Male (5) Quebec (2) Fairly satisfied ## 3 1994 (1) Male (5) Quebec (1) Very satisfied ## 4 2000 (1) Male (5) Quebec (2) Fairly satisfied ## 5 1984 (1) Male (5) Quebec (4) Not satisfied at all ## 6 1939 (1) Male (5) Quebec (3) Not very satisfied ## q9_interest_election q10_certain_vote ## 1 (8) (1) Certain ## 2 (10) Great deal of interest (1) Certain ## 3 (10) Great deal of interest (1) Certain ## 4 (6) (1) Certain ## 5 (10) Great deal of interest (1) Certain ## 6 (10) Great deal of interest (1) Certain ## q11_vote_intention ## 1 (-9) Don't know / Undecided ## 2 (-9) Don't know / Undecided ## 3 (1) Liberal (Grits) ## 4 (4) Bloc Québécois (BQ, PQ, Bloc, Parti Québécois) ## 5 (3) NDP (New Democratic Party, New Democrats, NDPers) ## 6 (4) Bloc Québécois (BQ, PQ, Bloc, Parti Québécois) ## q14_feeling_liberal_party q15_feeling_cons_party ## 1 60 40 ## 2 70 55 ## 3 70 60 ## 4 75 40 ## 5 10 10 ## 6 0 30
ces %>%
filter(q4_province=="(5) Quebec") %>%
group_by(q3_gender) %>%
summarise(mean_birthyear=mean(q2_birthyear),
sd=sd(q2_birthyear))
## # A tibble: 2 × 3 ## q3_gender mean_birthyear sd ## <chr> <dbl> <dbl> ## 1 (1) Male 1972. 15.9 ## 2 (2) Female 1970. 16.4
ces %>%
group_by(q4_province,q3_gender) %>%
summarise(mean_birthyear=mean(q2_birthyear),
sd=sd(q2_birthyear)) %>%
head(5)
## `summarise()` has grouped output by 'q4_province'. You can override using the ## `.groups` argument.
## # A tibble: 5 × 4 ## # Groups: q4_province [2] ## q4_province q3_gender mean_birthyear sd ## <chr> <chr> <dbl> <dbl> ## 1 (1) Newfoundland and Labrador (1) Male 1967. 15.7 ## 2 (1) Newfoundland and Labrador (2) Female 1965. 16.9 ## 3 (10) British Columbia (1) Male 1967. 17.3 ## 4 (10) British Columbia (2) Female 1966. 16.6 ## 5 (10) British Columbia (3) Other 1992 NA
The tidyverse is an opinionated collection of R packages designed for data science.
All packages share an underlying design philosophy, grammar, and data structures.
tidyverse is a R package. But it’s also a package of packages.
Core packages: readr, tidyr, dplyr, ggplot2
install.packages('tidyverse')
library(tidyverse)
read.csv("...")
read_csv("...")
plot(data$x,data$y)
ggplot(data,aes(x=x,y=y)) + geom_point()
tapply(mtcars$mpg,mtcars$cyl,mean)
## 4 6 8 ## 26.66364 19.74286 15.10000
mtcars %>% group_by(cyl) %>% summarise(mean(mpg))
## # A tibble: 3 × 2 ## cyl `mean(mpg)` ## <dbl> <dbl> ## 1 4 26.7 ## 2 6 19.7 ## 3 8 15.1
mtcars$new_var <- rnorm(nrow(mtcars),0,1) mtcars <- mtcars %>% mutate(new_var=rnorm(nrow(.),0,1))
head(ces)
## q2_birthyear q3_gender q4_province q6_satisfied_democracy ## 1 1963 (1) Male (5) Quebec (3) Not very satisfied ## 2 1973 (1) Male (5) Quebec (2) Fairly satisfied ## 3 1994 (1) Male (5) Quebec (1) Very satisfied ## 4 2000 (1) Male (5) Quebec (2) Fairly satisfied ## 5 1984 (1) Male (5) Quebec (4) Not satisfied at all ## 6 1939 (1) Male (5) Quebec (3) Not very satisfied ## q9_interest_election q10_certain_vote ## 1 (8) (1) Certain ## 2 (10) Great deal of interest (1) Certain ## 3 (10) Great deal of interest (1) Certain ## 4 (6) (1) Certain ## 5 (10) Great deal of interest (1) Certain ## 6 (10) Great deal of interest (1) Certain ## q11_vote_intention ## 1 (-9) Don't know / Undecided ## 2 (-9) Don't know / Undecided ## 3 (1) Liberal (Grits) ## 4 (4) Bloc Québécois (BQ, PQ, Bloc, Parti Québécois) ## 5 (3) NDP (New Democratic Party, New Democrats, NDPers) ## 6 (4) Bloc Québécois (BQ, PQ, Bloc, Parti Québécois) ## q14_feeling_liberal_party q15_feeling_cons_party ## 1 60 40 ## 2 70 55 ## 3 70 60 ## 4 75 40 ## 5 10 10 ## 6 0 30
(untidy_df <- tibble(age=18:30,
male_2016=round(rnorm(13,50000,5000)),
female_2016=round(rnorm(13,50000,5000)),
male_2017=round(rnorm(13,50000,5000)),
female_2017=round(rnorm(13,50000,5000)),
male_2018=round(rnorm(13,50000,5000)),
female_2018=round(rnorm(13,50000,5000))))
## # A tibble: 13 × 7 ## age male_2016 female_2016 male_2017 female_2017 male_2018 female_2018 ## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 18 59564 54647 61525 46469 43561 57906 ## 2 19 49530 48047 52844 50596 53742 50753 ## 3 20 52298 65491 47306 58496 52360 47564 ## 4 21 54563 40586 49413 50184 49165 50879 ## 5 22 58411 52824 49071 54078 54008 55440 ## 6 23 52324 56537 51123 47459 50506 51485 ## 7 24 56501 53670 56503 54379 46414 44554 ## 8 25 54753 45761 53031 47378 51845 49421 ## 9 26 59393 56615 46247 52151 55203 38155 ## 10 27 52496 47475 48400 36441 48584 54391 ## 11 28 53883 46741 49376 55957 43365 39146 ## 12 29 42629 44526 50054 44947 53615 44120 ## 13 30 46296 43434 50805 49060 44310 40083
(tidy_df <- untidy_df %>%
pivot_longer(-age,names_to = c("gender","year"),
values_to = "value",names_sep = "_"))
## # A tibble: 78 × 4 ## age gender year value ## <int> <chr> <chr> <dbl> ## 1 18 male 2016 59564 ## 2 18 female 2016 54647 ## 3 18 male 2017 61525 ## 4 18 female 2017 46469 ## 5 18 male 2018 43561 ## 6 18 female 2018 57906 ## 7 19 male 2016 49530 ## 8 19 female 2016 48047 ## 9 19 male 2017 52844 ## 10 19 female 2017 50596 ## # … with 68 more rows
Read in data with readr, haven, readxl.
I’ve also used readstata13 which is not in tidyverse.
readr
read_csv(),read_tsv(),read_delim()haven
read_sas(),read_spss(),read_stata()readxl
read_xls(),read_xlsx(),read_excel()filter() keep only certain rowsarrange() order rows by order in a variableselect() keep only certain variablesmutate() create a variablegroup_by() group by values in a variablesummarise() summarise e.g. mean, sd etc.pull() extract one variable and make it vectorn() and count()glimpse() give summary of datasummarize_at summarise on many variablesmutate_at create/modify many variablessummarize_allmutate_allclass(mtcars)
## [1] "data.frame"
mtcars_tbl <- as_tibble(mtcars) class(mtcars_tbl)
## [1] "tbl_df" "tbl" "data.frame"
mtcars
## mpg cyl disp hp drat wt qsec vs am gear carb ## Mazda RX4 21.0 6 160.0 110 3.90 2.620 16.46 0 1 4 4 ## Mazda RX4 Wag 21.0 6 160.0 110 3.90 2.875 17.02 0 1 4 4 ## Datsun 710 22.8 4 108.0 93 3.85 2.320 18.61 1 1 4 1 ## Hornet 4 Drive 21.4 6 258.0 110 3.08 3.215 19.44 1 0 3 1 ## Hornet Sportabout 18.7 8 360.0 175 3.15 3.440 17.02 0 0 3 2 ## Valiant 18.1 6 225.0 105 2.76 3.460 20.22 1 0 3 1 ## Duster 360 14.3 8 360.0 245 3.21 3.570 15.84 0 0 3 4 ## Merc 240D 24.4 4 146.7 62 3.69 3.190 20.00 1 0 4 2 ## Merc 230 22.8 4 140.8 95 3.92 3.150 22.90 1 0 4 2 ## Merc 280 19.2 6 167.6 123 3.92 3.440 18.30 1 0 4 4 ## Merc 280C 17.8 6 167.6 123 3.92 3.440 18.90 1 0 4 4 ## Merc 450SE 16.4 8 275.8 180 3.07 4.070 17.40 0 0 3 3 ## Merc 450SL 17.3 8 275.8 180 3.07 3.730 17.60 0 0 3 3 ## Merc 450SLC 15.2 8 275.8 180 3.07 3.780 18.00 0 0 3 3 ## Cadillac Fleetwood 10.4 8 472.0 205 2.93 5.250 17.98 0 0 3 4 ## Lincoln Continental 10.4 8 460.0 215 3.00 5.424 17.82 0 0 3 4 ## Chrysler Imperial 14.7 8 440.0 230 3.23 5.345 17.42 0 0 3 4 ## Fiat 128 32.4 4 78.7 66 4.08 2.200 19.47 1 1 4 1 ## Honda Civic 30.4 4 75.7 52 4.93 1.615 18.52 1 1 4 2 ## Toyota Corolla 33.9 4 71.1 65 4.22 1.835 19.90 1 1 4 1 ## Toyota Corona 21.5 4 120.1 97 3.70 2.465 20.01 1 0 3 1 ## Dodge Challenger 15.5 8 318.0 150 2.76 3.520 16.87 0 0 3 2 ## AMC Javelin 15.2 8 304.0 150 3.15 3.435 17.30 0 0 3 2 ## Camaro Z28 13.3 8 350.0 245 3.73 3.840 15.41 0 0 3 4 ## Pontiac Firebird 19.2 8 400.0 175 3.08 3.845 17.05 0 0 3 2 ## Fiat X1-9 27.3 4 79.0 66 4.08 1.935 18.90 1 1 4 1 ## Porsche 914-2 26.0 4 120.3 91 4.43 2.140 16.70 0 1 5 2 ## Lotus Europa 30.4 4 95.1 113 3.77 1.513 16.90 1 1 5 2 ## Ford Pantera L 15.8 8 351.0 264 4.22 3.170 14.50 0 1 5 4 ## Ferrari Dino 19.7 6 145.0 175 3.62 2.770 15.50 0 1 5 6 ## Maserati Bora 15.0 8 301.0 335 3.54 3.570 14.60 0 1 5 8 ## Volvo 142E 21.4 4 121.0 109 4.11 2.780 18.60 1 1 4 2 ## cylQUADRATIC new_var ## Mazda RX4 86.5 0.03041931 ## Mazda RX4 Wag 86.5 -0.53822260 ## Datsun 710 54.5 -1.04377473 ## Hornet 4 Drive 86.5 1.97747401 ## Hornet Sportabout 126.5 0.59528221 ## Valiant 86.5 1.86370098 ## Duster 360 126.5 -2.40461085 ## Merc 240D 54.5 0.08163916 ## Merc 230 54.5 0.64849342 ## Merc 280 86.5 1.12061364 ## Merc 280C 86.5 -0.89804433 ## Merc 450SE 126.5 -0.93955816 ## Merc 450SL 126.5 -0.38454992 ## Merc 450SLC 126.5 -1.00229585 ## Cadillac Fleetwood 126.5 0.03196600 ## Lincoln Continental 126.5 -0.57631515 ## Chrysler Imperial 126.5 1.23589376 ## Fiat 128 54.5 0.39147670 ## Honda Civic 54.5 -0.48915168 ## Toyota Corolla 54.5 -0.91361244 ## Toyota Corona 54.5 0.13172686 ## Dodge Challenger 126.5 -0.81758813 ## AMC Javelin 126.5 -0.74465426 ## Camaro Z28 126.5 0.97414435 ## Pontiac Firebird 126.5 0.71260322 ## Fiat X1-9 54.5 -2.11868654 ## Porsche 914-2 54.5 0.65684192 ## Lotus Europa 54.5 1.48618992 ## Ford Pantera L 126.5 -0.39842097 ## Ferrari Dino 86.5 -0.39872318 ## Maserati Bora 126.5 0.67989343 ## Volvo 142E 54.5 -1.34404902
mtcars_tbl
## # A tibble: 32 × 13 ## mpg cyl disp hp drat wt qsec vs am gear carb ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 ## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 ## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 ## 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 ## 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 ## 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1 ## 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4 ## 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2 ## 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2 ## 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4 ## # … with 22 more rows, and 2 more variables: cylQUADRATIC <dbl>, new_var <dbl>
mtcars_tbl <- mtcars %>% mutate(name=row.names(.)) %>% as_tibble() head(mtcars_tbl,3)
## # A tibble: 3 × 14 ## mpg cyl disp hp drat wt qsec vs am gear carb cylQUADRATIC ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 86.5 ## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 86.5 ## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 54.5 ## # … with 2 more variables: new_var <dbl>, name <chr>
mtcars <- mtcars_tbl
mtcars %>% slice(c(1,2,3))
## # A tibble: 3 × 14 ## mpg cyl disp hp drat wt qsec vs am gear carb cylQUADRATIC ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 86.5 ## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 86.5 ## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 54.5 ## # … with 2 more variables: new_var <dbl>, name <chr>
mtcars %>% slice(c(1,4,5))
## # A tibble: 3 × 14 ## mpg cyl disp hp drat wt qsec vs am gear carb cylQUADRATIC ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 86.5 ## 2 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 86.5 ## 3 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 126. ## # … with 2 more variables: new_var <dbl>, name <chr>
mtcars %>% glimpse()
## Rows: 32 ## Columns: 14 ## $ mpg <dbl> 21.0, 21.0, 22.8, 21.4, 18.7, 18.1, 14.3, 24.4, 22.8, 19.… ## $ cyl <dbl> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, … ## $ disp <dbl> 160.0, 160.0, 108.0, 258.0, 360.0, 225.0, 360.0, 146.7, 1… ## $ hp <dbl> 110, 110, 93, 110, 175, 105, 245, 62, 95, 123, 123, 180, … ## $ drat <dbl> 3.90, 3.90, 3.85, 3.08, 3.15, 2.76, 3.21, 3.69, 3.92, 3.9… ## $ wt <dbl> 2.620, 2.875, 2.320, 3.215, 3.440, 3.460, 3.570, 3.190, 3… ## $ qsec <dbl> 16.46, 17.02, 18.61, 19.44, 17.02, 20.22, 15.84, 20.00, 2… ## $ vs <dbl> 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, … ## $ am <dbl> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, … ## $ gear <dbl> 4, 4, 4, 3, 3, 3, 3, 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 4, 4, … ## $ carb <dbl> 4, 4, 1, 1, 2, 1, 4, 2, 2, 4, 4, 3, 3, 3, 4, 4, 4, 1, 2, … ## $ cylQUADRATIC <dbl> 86.5, 86.5, 54.5, 86.5, 126.5, 86.5, 126.5, 54.5, 54.5, 8… ## $ new_var <dbl> 0.03041931, -0.53822260, -1.04377473, 1.97747401, 0.59528… ## $ name <chr> "Mazda RX4", "Mazda RX4 Wag", "Datsun 710", "Hornet 4 Dri…
mtcars %>% filter(cyl==4)
## # A tibble: 11 × 14 ## mpg cyl disp hp drat wt qsec vs am gear carb ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 ## 2 24.4 4 147. 62 3.69 3.19 20 1 0 4 2 ## 3 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2 ## 4 32.4 4 78.7 66 4.08 2.2 19.5 1 1 4 1 ## 5 30.4 4 75.7 52 4.93 1.62 18.5 1 1 4 2 ## 6 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1 ## 7 21.5 4 120. 97 3.7 2.46 20.0 1 0 3 1 ## 8 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1 ## 9 26 4 120. 91 4.43 2.14 16.7 0 1 5 2 ## 10 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2 ## 11 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2 ## # … with 3 more variables: cylQUADRATIC <dbl>, new_var <dbl>, name <chr>
mtcars %>% filter(cyl!=4)
## # A tibble: 21 × 14 ## mpg cyl disp hp drat wt qsec vs am gear carb ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 ## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 ## 3 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 ## 4 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 ## 5 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1 ## 6 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4 ## 7 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4 ## 8 17.8 6 168. 123 3.92 3.44 18.9 1 0 4 4 ## 9 16.4 8 276. 180 3.07 4.07 17.4 0 0 3 3 ## 10 17.3 8 276. 180 3.07 3.73 17.6 0 0 3 3 ## # … with 11 more rows, and 3 more variables: cylQUADRATIC <dbl>, new_var <dbl>, ## # name <chr>
mtcars %>% filter(cyl %in% c(4,6))
## # A tibble: 18 × 14 ## mpg cyl disp hp drat wt qsec vs am gear carb ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 ## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 ## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 ## 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 ## 5 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1 ## 6 24.4 4 147. 62 3.69 3.19 20 1 0 4 2 ## 7 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2 ## 8 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4 ## 9 17.8 6 168. 123 3.92 3.44 18.9 1 0 4 4 ## 10 32.4 4 78.7 66 4.08 2.2 19.5 1 1 4 1 ## 11 30.4 4 75.7 52 4.93 1.62 18.5 1 1 4 2 ## 12 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1 ## 13 21.5 4 120. 97 3.7 2.46 20.0 1 0 3 1 ## 14 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1 ## 15 26 4 120. 91 4.43 2.14 16.7 0 1 5 2 ## 16 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2 ## 17 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6 ## 18 21.4 4 121 109 4.11 2.78 18.6 1 1 4 2 ## # … with 3 more variables: cylQUADRATIC <dbl>, new_var <dbl>, name <chr>
select(mtcars,hp,mpg,cyl)
## # A tibble: 32 × 3 ## hp mpg cyl ## <dbl> <dbl> <dbl> ## 1 110 21 6 ## 2 110 21 6 ## 3 93 22.8 4 ## 4 110 21.4 6 ## 5 175 18.7 8 ## 6 105 18.1 6 ## 7 245 14.3 8 ## 8 62 24.4 4 ## 9 95 22.8 4 ## 10 123 19.2 6 ## # … with 22 more rows
mtcars %>% select(-mpg)
## # A tibble: 32 × 13 ## cyl disp hp drat wt qsec vs am gear carb cylQUADRATIC ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 6 160 110 3.9 2.62 16.5 0 1 4 4 86.5 ## 2 6 160 110 3.9 2.88 17.0 0 1 4 4 86.5 ## 3 4 108 93 3.85 2.32 18.6 1 1 4 1 54.5 ## 4 6 258 110 3.08 3.22 19.4 1 0 3 1 86.5 ## 5 8 360 175 3.15 3.44 17.0 0 0 3 2 126. ## 6 6 225 105 2.76 3.46 20.2 1 0 3 1 86.5 ## 7 8 360 245 3.21 3.57 15.8 0 0 3 4 126. ## 8 4 147. 62 3.69 3.19 20 1 0 4 2 54.5 ## 9 4 141. 95 3.92 3.15 22.9 1 0 4 2 54.5 ## 10 6 168. 123 3.92 3.44 18.3 1 0 4 4 86.5 ## # … with 22 more rows, and 2 more variables: new_var <dbl>, name <chr>
mtcars %>% select(starts_with("c"),starts_with("h"))
## # A tibble: 32 × 4 ## cyl carb cylQUADRATIC hp ## <dbl> <dbl> <dbl> <dbl> ## 1 6 4 86.5 110 ## 2 6 4 86.5 110 ## 3 4 1 54.5 93 ## 4 6 1 86.5 110 ## 5 8 2 126. 175 ## 6 6 1 86.5 105 ## 7 8 4 126. 245 ## 8 4 2 54.5 62 ## 9 4 2 54.5 95 ## 10 6 4 86.5 123 ## # … with 22 more rows
mtcars %>% arrange(mpg)
## # A tibble: 32 × 14 ## mpg cyl disp hp drat wt qsec vs am gear carb ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 10.4 8 472 205 2.93 5.25 18.0 0 0 3 4 ## 2 10.4 8 460 215 3 5.42 17.8 0 0 3 4 ## 3 13.3 8 350 245 3.73 3.84 15.4 0 0 3 4 ## 4 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4 ## 5 14.7 8 440 230 3.23 5.34 17.4 0 0 3 4 ## 6 15 8 301 335 3.54 3.57 14.6 0 1 5 8 ## 7 15.2 8 276. 180 3.07 3.78 18 0 0 3 3 ## 8 15.2 8 304 150 3.15 3.44 17.3 0 0 3 2 ## 9 15.5 8 318 150 2.76 3.52 16.9 0 0 3 2 ## 10 15.8 8 351 264 4.22 3.17 14.5 0 1 5 4 ## # … with 22 more rows, and 3 more variables: cylQUADRATIC <dbl>, new_var <dbl>, ## # name <chr>
mtcars %>% arrange(desc(mpg))
## # A tibble: 32 × 14 ## mpg cyl disp hp drat wt qsec vs am gear carb ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1 ## 2 32.4 4 78.7 66 4.08 2.2 19.5 1 1 4 1 ## 3 30.4 4 75.7 52 4.93 1.62 18.5 1 1 4 2 ## 4 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2 ## 5 27.3 4 79 66 4.08 1.94 18.9 1 1 4 1 ## 6 26 4 120. 91 4.43 2.14 16.7 0 1 5 2 ## 7 24.4 4 147. 62 3.69 3.19 20 1 0 4 2 ## 8 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 ## 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2 ## 10 21.5 4 120. 97 3.7 2.46 20.0 1 0 3 1 ## # … with 22 more rows, and 3 more variables: cylQUADRATIC <dbl>, new_var <dbl>, ## # name <chr>
mtcars %>% arrange(desc(cyl),disp)
## # A tibble: 32 × 14 ## mpg cyl disp hp drat wt qsec vs am gear carb ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 16.4 8 276. 180 3.07 4.07 17.4 0 0 3 3 ## 2 17.3 8 276. 180 3.07 3.73 17.6 0 0 3 3 ## 3 15.2 8 276. 180 3.07 3.78 18 0 0 3 3 ## 4 15 8 301 335 3.54 3.57 14.6 0 1 5 8 ## 5 15.2 8 304 150 3.15 3.44 17.3 0 0 3 2 ## 6 15.5 8 318 150 2.76 3.52 16.9 0 0 3 2 ## 7 13.3 8 350 245 3.73 3.84 15.4 0 0 3 4 ## 8 15.8 8 351 264 4.22 3.17 14.5 0 1 5 4 ## 9 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 ## 10 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4 ## # … with 22 more rows, and 3 more variables: cylQUADRATIC <dbl>, new_var <dbl>, ## # name <chr>
mtcars %>% mutate(hpsquare=hp^2) %>% select(mpg,cyl,disp,hp,hpsquare)
## # A tibble: 32 × 5 ## mpg cyl disp hp hpsquare ## <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 21 6 160 110 12100 ## 2 21 6 160 110 12100 ## 3 22.8 4 108 93 8649 ## 4 21.4 6 258 110 12100 ## 5 18.7 8 360 175 30625 ## 6 18.1 6 225 105 11025 ## 7 14.3 8 360 245 60025 ## 8 24.4 4 147. 62 3844 ## 9 22.8 4 141. 95 9025 ## 10 19.2 6 168. 123 15129 ## # … with 22 more rows
mtcars %>%
mutate(randomnoise=rnorm(nrow(.),mean=0,sd=1),
mpg_with_random_noise = mpg+randomnoise) %>%
select(mpg,cyl,disp,randomnoise,mpg_with_random_noise)
## # A tibble: 32 × 5 ## mpg cyl disp randomnoise mpg_with_random_noise ## <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 21 6 160 -0.997 20.0 ## 2 21 6 160 2.26 23.3 ## 3 22.8 4 108 -1.15 21.6 ## 4 21.4 6 258 -0.848 20.6 ## 5 18.7 8 360 -0.462 18.2 ## 6 18.1 6 225 -0.405 17.7 ## 7 14.3 8 360 0.671 15.0 ## 8 24.4 4 147. -0.857 23.5 ## 9 22.8 4 141. -0.522 22.3 ## 10 19.2 6 168. 0.359 19.6 ## # … with 22 more rows
mtcars %>%
mutate(cyl=factor(cyl,levels=c(4,6,8),
labels=c("4 Cyl","6 Cyl","8 Cyl")))
## # A tibble: 32 × 14 ## mpg cyl disp hp drat wt qsec vs am gear carb ## <dbl> <fct> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 21 6 Cyl 160 110 3.9 2.62 16.5 0 1 4 4 ## 2 21 6 Cyl 160 110 3.9 2.88 17.0 0 1 4 4 ## 3 22.8 4 Cyl 108 93 3.85 2.32 18.6 1 1 4 1 ## 4 21.4 6 Cyl 258 110 3.08 3.22 19.4 1 0 3 1 ## 5 18.7 8 Cyl 360 175 3.15 3.44 17.0 0 0 3 2 ## 6 18.1 6 Cyl 225 105 2.76 3.46 20.2 1 0 3 1 ## 7 14.3 8 Cyl 360 245 3.21 3.57 15.8 0 0 3 4 ## 8 24.4 4 Cyl 147. 62 3.69 3.19 20 1 0 4 2 ## 9 22.8 4 Cyl 141. 95 3.92 3.15 22.9 1 0 4 2 ## 10 19.2 6 Cyl 168. 123 3.92 3.44 18.3 1 0 4 4 ## # … with 22 more rows, and 3 more variables: cylQUADRATIC <dbl>, new_var <dbl>, ## # name <chr>
vect <- c("Much less","About the same","Much more")
class(vect)
## [1] "character"
table(vect)
## vect ## About the same Much less Much more ## 1 1 1
vect <- factor(vect,levels=c("Much less","About the same","Much more"))
class(vect)
## [1] "factor"
vect <- factor(vect,levels=c("Much less","About the same","Much more"),
labels=c("ml","abs","mm"))
table(vect)
## vect ## ml abs mm ## 1 1 1
vect <- 1:22 factor(vect)
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 ## Levels: 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22
vect <- as.character(1:22) factor(vect)
## [1] 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 ## Levels: 1 10 11 12 13 14 15 16 17 18 19 2 20 21 22 3 4 5 6 7 8 9
mtcars %>%
mutate(cyl=factor(cyl,levels=c(4,6,8),
labels=c("4 Cyl","6 Cyl","8 Cyl")),
cyl2 = fct_recode(cyl,"Small"="4 Cyl",
"Big"="6 Cyl",
"Big"="8 Cyl")) %>%
select(cyl,cyl2)
## # A tibble: 32 × 2 ## cyl cyl2 ## <fct> <fct> ## 1 6 Cyl Big ## 2 6 Cyl Big ## 3 4 Cyl Small ## 4 6 Cyl Big ## 5 8 Cyl Big ## 6 6 Cyl Big ## 7 8 Cyl Big ## 8 4 Cyl Small ## 9 4 Cyl Small ## 10 6 Cyl Big ## # … with 22 more rows
temp <- mtcars %>%
mutate(cyl_f=factor(cyl,levels=c(4,6,8),
labels=c("4 Cyl","6 Cyl","8 Cyl")),
cyl2 = fct_recode(cyl_f,"Small"="4 Cyl","Big"="6 Cyl","Big"="8 Cyl"),
cyl2REV=fct_relevel(cyl2,"Big","Small"))
table(temp$cyl2)
## ## Small Big ## 11 21
table(temp$cyl2REV)
## ## Big Small ## 21 11
mtcars %>%
mutate(miles_per_liter = mpg*3.78,
miles_per_gallon=miles_per_liter/3.78) %>%
select(miles_per_liter,mpg,miles_per_gallon)
## # A tibble: 32 × 3 ## miles_per_liter mpg miles_per_gallon ## <dbl> <dbl> <dbl> ## 1 79.4 21 21 ## 2 79.4 21 21 ## 3 86.2 22.8 22.8 ## 4 80.9 21.4 21.4 ## 5 70.7 18.7 18.7 ## 6 68.4 18.1 18.1 ## 7 54.1 14.3 14.3 ## 8 92.2 24.4 24.4 ## 9 86.2 22.8 22.8 ## 10 72.6 19.2 19.2 ## # … with 22 more rows
mtcars %>% group_by(cyl)
## # A tibble: 32 × 14 ## # Groups: cyl [3] ## mpg cyl disp hp drat wt qsec vs am gear carb ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 21 6 160 110 3.9 2.62 16.5 0 1 4 4 ## 2 21 6 160 110 3.9 2.88 17.0 0 1 4 4 ## 3 22.8 4 108 93 3.85 2.32 18.6 1 1 4 1 ## 4 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 ## 5 18.7 8 360 175 3.15 3.44 17.0 0 0 3 2 ## 6 18.1 6 225 105 2.76 3.46 20.2 1 0 3 1 ## 7 14.3 8 360 245 3.21 3.57 15.8 0 0 3 4 ## 8 24.4 4 147. 62 3.69 3.19 20 1 0 4 2 ## 9 22.8 4 141. 95 3.92 3.15 22.9 1 0 4 2 ## 10 19.2 6 168. 123 3.92 3.44 18.3 1 0 4 4 ## # … with 22 more rows, and 3 more variables: cylQUADRATIC <dbl>, new_var <dbl>, ## # name <chr>
mtcars %>% group_by(cyl) %>% mutate(mean_mpg_per_cyl=mean(mpg)) %>% select(mpg,cyl,disp,mean_mpg_per_cyl)
## # A tibble: 32 × 4 ## # Groups: cyl [3] ## mpg cyl disp mean_mpg_per_cyl ## <dbl> <dbl> <dbl> <dbl> ## 1 21 6 160 19.7 ## 2 21 6 160 19.7 ## 3 22.8 4 108 26.7 ## 4 21.4 6 258 19.7 ## 5 18.7 8 360 15.1 ## 6 18.1 6 225 19.7 ## 7 14.3 8 360 15.1 ## 8 24.4 4 147. 26.7 ## 9 22.8 4 141. 26.7 ## 10 19.2 6 168. 19.7 ## # … with 22 more rows
mtcars %>% group_by(cyl) %>% mutate(max_mpg_per_cyl=max(mpg))%>% select(mpg,cyl,disp,max_mpg_per_cyl)
## # A tibble: 32 × 4 ## # Groups: cyl [3] ## mpg cyl disp max_mpg_per_cyl ## <dbl> <dbl> <dbl> <dbl> ## 1 21 6 160 21.4 ## 2 21 6 160 21.4 ## 3 22.8 4 108 33.9 ## 4 21.4 6 258 21.4 ## 5 18.7 8 360 19.2 ## 6 18.1 6 225 21.4 ## 7 14.3 8 360 19.2 ## 8 24.4 4 147. 33.9 ## 9 22.8 4 141. 33.9 ## 10 19.2 6 168. 21.4 ## # … with 22 more rows
mtcars %>% arrange(cyl,desc(mpg)) %>% group_by(cyl) %>% mutate(n=1:n()) %>% filter(n==1)
## # A tibble: 3 × 15 ## # Groups: cyl [3] ## mpg cyl disp hp drat wt qsec vs am gear carb cylQUADRATIC ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 33.9 4 71.1 65 4.22 1.84 19.9 1 1 4 1 54.5 ## 2 21.4 6 258 110 3.08 3.22 19.4 1 0 3 1 86.5 ## 3 19.2 8 400 175 3.08 3.84 17.0 0 0 3 2 126. ## # … with 3 more variables: new_var <dbl>, name <chr>, n <int>
mtcars %>% arrange(cyl,mpg) %>% group_by(cyl) %>% mutate(n=1:n()) %>% mutate(type=ifelse(n==1,"Best in class","Other")) %>% select(cyl,mpg,type,name)
## # A tibble: 32 × 4 ## # Groups: cyl [3] ## cyl mpg type name ## <dbl> <dbl> <chr> <chr> ## 1 4 21.4 Best in class Volvo 142E ## 2 4 21.5 Other Toyota Corona ## 3 4 22.8 Other Datsun 710 ## 4 4 22.8 Other Merc 230 ## 5 4 24.4 Other Merc 240D ## 6 4 26 Other Porsche 914-2 ## 7 4 27.3 Other Fiat X1-9 ## 8 4 30.4 Other Honda Civic ## 9 4 30.4 Other Lotus Europa ## 10 4 32.4 Other Fiat 128 ## # … with 22 more rows
mtcars %>% group_by(cyl) %>% filter(hp==max(hp))
## # A tibble: 3 × 14 ## # Groups: cyl [3] ## mpg cyl disp hp drat wt qsec vs am gear carb cylQUADRATIC ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2 54.5 ## 2 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6 86.5 ## 3 15 8 301 335 3.54 3.57 14.6 0 1 5 8 126. ## # … with 2 more variables: new_var <dbl>, name <chr>
mtcars %>% group_by(cyl) %>% top_n(1,hp)
## # A tibble: 3 × 14 ## # Groups: cyl [3] ## mpg cyl disp hp drat wt qsec vs am gear carb cylQUADRATIC ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2 54.5 ## 2 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6 86.5 ## 3 15 8 301 335 3.54 3.57 14.6 0 1 5 8 126. ## # … with 2 more variables: new_var <dbl>, name <chr>
mtcars %>% group_by(cyl) %>% arrange(desc(hp)) %>% slice(1)
## # A tibble: 3 × 14 ## # Groups: cyl [3] ## mpg cyl disp hp drat wt qsec vs am gear carb cylQUADRATIC ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 30.4 4 95.1 113 3.77 1.51 16.9 1 1 5 2 54.5 ## 2 19.7 6 145 175 3.62 2.77 15.5 0 1 5 6 86.5 ## 3 15 8 301 335 3.54 3.57 14.6 0 1 5 8 126. ## # … with 2 more variables: new_var <dbl>, name <chr>
mtcars %>% group_by(cyl,am) %>% count()
## # A tibble: 6 × 3 ## # Groups: cyl, am [6] ## cyl am n ## <dbl> <dbl> <int> ## 1 4 0 3 ## 2 4 1 8 ## 3 6 0 4 ## 4 6 1 3 ## 5 8 0 12 ## 6 8 1 2
mtcars %>% summarise(mean=mean(mpg))
## # A tibble: 1 × 1 ## mean ## <dbl> ## 1 20.1
mtcars %>% group_by(cyl) %>% summarise(mean=mean(mpg))
## # A tibble: 3 × 2 ## cyl mean ## <dbl> <dbl> ## 1 4 26.7 ## 2 6 19.7 ## 3 8 15.1
mtcars %>% group_by(cyl) %>% mean(mpg)
## Warning in mean.default(., mpg): argument is not numeric or logical: returning ## NA
## [1] NA
mtcars %>% group_by(cyl) %>% pull() %>% mean()
## Warning in mean.default(.): argument is not numeric or logical: returning NA
## [1] NA
mtcars %>%
group_by(cyl) %>%
summarise(median_mpg=median(mpg),
mean(mpg),
sd_mpg=sd(mpg),
n=n())
## # A tibble: 3 × 5 ## cyl median_mpg `mean(mpg)` sd_mpg n ## <dbl> <dbl> <dbl> <dbl> <int> ## 1 4 26 26.7 4.51 11 ## 2 6 19.7 19.7 1.45 7 ## 3 8 15.2 15.1 2.56 14
mtcars %>% mutate_at(.vars = vars(cyl,am),factor) %>% select(cyl,am) %>% glimpse()
## Rows: 32 ## Columns: 2 ## $ cyl <fct> 6, 6, 4, 6, 8, 6, 8, 4, 4, 6, 6, 8, 8, 8, 8, 8, 8, 4, 4, 4, 4, 8, … ## $ am <fct> 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, …
mtcars %>% group_by(cyl) %>% summarise_at(.vars=vars(mpg,disp,qsec),.funs=mean)
## # A tibble: 3 × 4 ## cyl mpg disp qsec ## <dbl> <dbl> <dbl> <dbl> ## 1 4 26.7 105. 19.1 ## 2 6 19.7 183. 18.0 ## 3 8 15.1 353. 16.8
mtcars %>%
group_by(cyl) %>%
summarise_at(.vars=vars(mpg,disp,qsec),
.funs=list(mean = mean, median = median, max=max,min=min))
## # A tibble: 3 × 13 ## cyl mpg_mean disp_mean qsec_mean mpg_median disp_median qsec_median mpg_max ## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 4 26.7 105. 19.1 26 108 18.9 33.9 ## 2 6 19.7 183. 18.0 19.7 168. 18.3 21.4 ## 3 8 15.1 353. 16.8 15.2 350. 17.2 19.2 ## # … with 5 more variables: disp_max <dbl>, qsec_max <dbl>, mpg_min <dbl>, ## # disp_min <dbl>, qsec_min <dbl>
Package to clean data and create tidy data. Tidy data is:
tidy_df <- untidy_df %>%
pivot_longer(-age,names_to = c("gender","year"),
values_to = "value",names_sep = "_")
tidy_df
## # A tibble: 78 × 4 ## age gender year value ## <int> <chr> <chr> <dbl> ## 1 18 male 2016 59564 ## 2 18 female 2016 54647 ## 3 18 male 2017 61525 ## 4 18 female 2017 46469 ## 5 18 male 2018 43561 ## 6 18 female 2018 57906 ## 7 19 male 2016 49530 ## 8 19 female 2016 48047 ## 9 19 male 2017 52844 ## 10 19 female 2017 50596 ## # … with 68 more rows
pivot_wider(tidy_df,
names_from=c("gender","year"),
values_from="value")
## # A tibble: 13 × 7 ## age male_2016 female_2016 male_2017 female_2017 male_2018 female_2018 ## <int> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> ## 1 18 59564 54647 61525 46469 43561 57906 ## 2 19 49530 48047 52844 50596 53742 50753 ## 3 20 52298 65491 47306 58496 52360 47564 ## 4 21 54563 40586 49413 50184 49165 50879 ## 5 22 58411 52824 49071 54078 54008 55440 ## 6 23 52324 56537 51123 47459 50506 51485 ## 7 24 56501 53670 56503 54379 46414 44554 ## 8 25 54753 45761 53031 47378 51845 49421 ## 9 26 59393 56615 46247 52151 55203 38155 ## 10 27 52496 47475 48400 36441 48584 54391 ## 11 28 53883 46741 49376 55957 43365 39146 ## 12 29 42629 44526 50054 44947 53615 44120 ## 13 30 46296 43434 50805 49060 44310 40083
ggplot(). Then add layers with geom_point(),geom_line(),geom_bar(),geom_boxplot()..etc.ggplot(data=mtcars,aes(x=hp,y=mpg)) + geom_point()
ggplot(data=mtcars,aes(x=hp,y=mpg,color=factor(cyl))) + geom_point()
g <- ggplot(data=mtcars,aes(x=hp,y=mpg,color=factor(cyl))) +
geom_point() + geom_smooth(method="lm",se=FALSE) +
geom_smooth(aes(x=hp,y=mpg,color="1"),color="black",method = "lm",se=FALSE) +
labs(x="Some X-ax title",y="Some Y-ax title",title="Some title",
color="Cyl",subtitle = "Some subtitle",caption="Source: My source") +
scale_x_continuous(limits=c(0,400)) + scale_y_continuous(limits=c(0,50)) +
theme_dark() + theme(legend.title = element_text(size=12)) +
scale_color_manual(values=c("grey","green","pink"))
print(g)
If you are interested in Data Viz, I highly recommend:
Data Visualization and Data Exploration using R
with Dr. Alicia Eads
Thursday, April 28, 1-4pm, SSMW 2022
https://dataverse.harvard.edu/dataset.xhtml?persistentId=doi:10.7910/DVN/8RHLG1
library(readstata13)
df <- read.dta13("data/2019 Canadian Election Study - Phone Survey v1.0.dta")
ces_2019_subset <- df %>%
select(q2,q3,q4,q6,q9,q10,q11,q14,q15,q31,q32,weight_CES) %>%
as_tibble()
ces_2019_subset %>% glimpse()
## Rows: 4,021 ## Columns: 12 ## $ q2 <int> 1963, 1973, 1994, 2000, 1984, 1939, 1999, 1995, 1963, 1970,… ## $ q3 <fct> (1) Male, (1) Male, (1) Male, (1) Male, (1) Male, (1) Male,… ## $ q4 <fct> (5) Quebec, (5) Quebec, (5) Quebec, (5) Quebec, (5) Quebec,… ## $ q6 <fct> (3) Not very satisfied, (2) Fairly satisfied, (1) Very sati… ## $ q9 <fct> (8), (10) Great deal of interest, (10) Great deal of intere… ## $ q10 <fct> (1) Certain, (1) Certain, (1) Certain, (1) Certain, (1) Cer… ## $ q11 <fct> "(-9) Don't know / Undecided", "(-9) Don't know / Undecided… ## $ q14 <int> 60, 70, 70, 75, 10, 0, 50, 65, 50, 70, 15, 40, 50, 50, 90, … ## $ q15 <int> 40, 55, 60, 40, 10, 30, 20, 25, 80, 10, 50, 25, 75, 0, -6, … ## $ q31 <fct> (2) Worse, (3) About the same, (1) Better, (3) About the sa… ## $ q32 <fct> (3) Not made much difference, (3) Not made much difference,… ## $ weight_CES <dbl> 0.9019529, 0.9019529, 0.9019529, 1.2334642, 0.9019529, 0.90…
ces_2019_subset <- ces_2019_subset %>%
rename(q2_birthyear=q2,q3_gender=q3,q4_province=q4,
q6_satisfied_democracy=q6,q9_interest_election=q9,
q10_certain_vote=q10,q11_vote_intention=q11,
q14_feeling_liberal_party=q14,q15_feeling_cons_party=q15,
q31_ecnchange=q31,q32_policies_fed_gov_ecn=q32) %>%
filter(q3_gender %in% c("(1) Male","(2) Female"))
ces_2019_subset <- ces_2019_subset %>%
# Q14 and Q15 have negative values which should be NAs
# See codebook
mutate(q14_feeling_liberal_party=ifelse(q14_feeling_liberal_party<0,
NA,q14_feeling_liberal_party),
q15_feeling_cons_party=ifelse(q15_feeling_cons_party<0,
NA,q15_feeling_cons_party))
ces_2019_subset <- ces_2019_subset %>%
mutate(q11_vote_intention=fct_recode(q11_vote_intention,
"(4) Bloc Québécois (BQ, PQ, Bloc, Parti Québécois)"=
"(4) Bloc Québécois (BQ, PQ, Bloc, Parti Québécois)"))
ces_2019_subset %>% group_by(q3_gender) %>% summarise(avg_feeling_lib=mean(q14_feeling_liberal_party,na.rm=TRUE))
## # A tibble: 2 × 2 ## q3_gender avg_feeling_lib ## <fct> <dbl> ## 1 (1) Male 44.3 ## 2 (2) Female 51.8
to_plot <- ces_2019_subset %>% group_by(q3_gender,q4_province) %>% summarise(avg_feeling_lib=mean(q14_feeling_liberal_party,na.rm=TRUE))
## `summarise()` has grouped output by 'q3_gender'. You can override using the ## `.groups` argument.
to_plot
## # A tibble: 20 × 3 ## # Groups: q3_gender [2] ## q3_gender q4_province avg_feeling_lib ## <fct> <fct> <dbl> ## 1 (1) Male (1) Newfoundland and Labrador 45.3 ## 2 (1) Male (2) Prince Edward Island 46.5 ## 3 (1) Male (3) Nova Scotia 49.4 ## 4 (1) Male (4) New Brunswick 47.8 ## 5 (1) Male (5) Quebec 49.2 ## 6 (1) Male (6) Ontario 50.3 ## 7 (1) Male (7) Manitoba 40.7 ## 8 (1) Male (8) Saskatchewan 32.1 ## 9 (1) Male (9) Alberta 24.7 ## 10 (1) Male (10) British Columbia 43.3 ## 11 (2) Female (1) Newfoundland and Labrador 57.1 ## 12 (2) Female (2) Prince Edward Island 57.8 ## 13 (2) Female (3) Nova Scotia 55.3 ## 14 (2) Female (4) New Brunswick 50.6 ## 15 (2) Female (5) Quebec 55.8 ## 16 (2) Female (6) Ontario 54.2 ## 17 (2) Female (7) Manitoba 44.9 ## 18 (2) Female (8) Saskatchewan 36.3 ## 19 (2) Female (9) Alberta 36.9 ## 20 (2) Female (10) British Columbia 53.2
to_plot %>% filter(q3_gender!="(3) Other") %>% ggplot(aes(x=q3_gender,y=avg_feeling_lib,fill=q4_province)) + geom_bar(stat = "identity",position="dodge")
to_plot %>% filter(q3_gender!="(3) Other") %>% ggplot(aes(x=q3_gender,y=avg_feeling_lib,fill=q4_province)) + geom_bar(stat = "identity",position="dodge") + geom_text(aes(label=round(avg_feeling_lib)),position = position_dodge(width = .9),vjust=-0.25)
to_plot <- ces_2019_subset %>%
mutate(q4_province_2=fct_recode(q4_province,"Atlantic"="(1) Newfoundland and Labrador",
"Atlantic"="(2) Prince Edward Island",
"Atlantic"="(3) Nova Scotia",
"Atlantic"="(4) New Brunswick")) %>%
group_by(q3_gender,q4_province_2) %>%
summarise(avg_feeling_lib=mean(q14_feeling_liberal_party,na.rm=TRUE))
## `summarise()` has grouped output by 'q3_gender'. You can override using the ## `.groups` argument.
to_plot %>%
filter(q3_gender!="(3) Other") %>%
ggplot(aes(x=q3_gender,y=avg_feeling_lib,fill=q4_province_2)) +
geom_bar(stat = "identity",position="dodge") +
geom_text(aes(label=round(avg_feeling_lib)),position =
position_dodge(width = .9),vjust=-0.25)
to_plot <- ces_2019_subset %>%
mutate(q4_province_2=fct_recode(q4_province,"Atlantic"="(1) Newfoundland and Labrador",
"Atlantic"="(2) Prince Edward Island",
"Atlantic"="(3) Nova Scotia",
"Atlantic"="(4) New Brunswick")) %>%
group_by(q3_gender,q4_province_2) %>%
summarise(avg_feeling_lib=weighted.mean(q14_feeling_liberal_party,
weight_CES,na.rm=TRUE))
## `summarise()` has grouped output by 'q3_gender'. You can override using the ## `.groups` argument.
to_plot %>%
filter(q3_gender!="(3) Other") %>% ggplot(aes(x=q3_gender,y=avg_feeling_lib,
fill=q4_province_2)) + geom_bar(stat = "identity",position="dodge") +
geom_text(aes(label=round(avg_feeling_lib)),
position =position_dodge(width = .9),vjust=-0.25)
to_plot <- ces_2019_subset %>%
mutate(q4_province_2=fct_recode(q4_province,"Atlantic"="(1) Newfoundland and Labrador",
"Atlantic"="(2) Prince Edward Island",
"Atlantic"="(3) Nova Scotia",
"Atlantic"="(4) New Brunswick")) %>%
group_by(q3_gender,q4_province_2) %>%
summarise(avg_feeling_cons=weighted.mean(q15_feeling_cons_party,weight_CES,na.rm=TRUE),
avg_feeling_lib=weighted.mean(q14_feeling_liberal_party,weight_CES,na.rm=TRUE))
## `summarise()` has grouped output by 'q3_gender'. You can override using the ## `.groups` argument.
to_plot %>%
pivot_longer(c("avg_feeling_cons","avg_feeling_lib"),names_to = "party",values_to = "value") %>% filter(q3_gender!="(3) Other") %>%
ggplot(aes(x=q3_gender,y=value,fill=party)) + geom_bar(stat = "identity",position="dodge") +
geom_text(aes(label=round(value)),position = position_dodge(width = .9),vjust=-0.25) +
facet_wrap(~q4_province_2) + scale_fill_manual(values=c("darkblue","red"))
to_plot <- ces_2019_subset %>%
filter(q31_ecnchange %in% c("(1) Better", "(2) Worse", "(3) About the same")) %>%
mutate(q4_province_2=fct_recode(q4_province,"Atlantic"="(1) Newfoundland and Labrador",
"Atlantic"="(2) Prince Edward Island",
"Atlantic"="(3) Nova Scotia",
"Atlantic"="(4) New Brunswick")) %>%
group_by(q3_gender,q4_province_2,q31_ecnchange) %>%
summarise(sum_weightsQ31=sum(weight_CES)) %>%
group_by(q3_gender,q4_province_2) %>%
mutate(pc=sum_weightsQ31/sum(sum_weightsQ31))
## `summarise()` has grouped output by 'q3_gender', 'q4_province_2'. You can ## override using the `.groups` argument.
to_plot %>% ggplot(aes(x=q31_ecnchange,y=pc,fill=q3_gender)) + facet_wrap(~q4_province_2) + geom_bar(stat = "identity",position="dodge") + geom_text(aes(label=round(pc*100)),position = position_dodge(width = .9),vjust=-0.25)
\[Feeling Lib Party_i = B0+B1*PoliciesFedGovEcn_i+B2*birthyear_i+B3*gender_i+u_i\]
ces_2019_subset <- ces_2019_subset %>%
filter(!(q32_policies_fed_gov_ecn %in%
c("(-9) Don't know", "(-8) Refused", "(-7) Skipped"))) %>%
mutate(q32_policies_fed_gov_ecn_numeric=
as.numeric(factor(q32_policies_fed_gov_ecn,
c("(2) Worse","(3) Not made much difference","(1) Better"))))
to_model <- ces_2019_subset %>%
select(q14_feeling_liberal_party,q32_policies_fed_gov_ecn_numeric,
q2_birthyear,q3_gender,q4_province) %>%
filter(complete.cases(.))
m <- lm(q14_feeling_liberal_party~q32_policies_fed_gov_ecn_numeric+
q2_birthyear+q3_gender+q4_province,to_model)
summary(m)
## ## Call: ## lm(formula = q14_feeling_liberal_party ~ q32_policies_fed_gov_ecn_numeric + ## q2_birthyear + q3_gender + q4_province, data = to_model) ## ## Residuals: ## Min 1Q Median 3Q Max ## -74.774 -17.181 1.591 17.562 80.162 ## ## Coefficients: ## Estimate Std. Error t value Pr(>|t|) ## (Intercept) -207.40471 45.73019 -4.535 5.93e-06 *** ## q32_policies_fed_gov_ecn_numeric 23.49835 0.58244 40.345 < 2e-16 *** ## q2_birthyear 0.10642 0.02322 4.584 4.72e-06 *** ## q3_gender(2) Female 5.74741 0.78113 7.358 2.29e-13 *** ## q4_province(2) Prince Edward Island -2.40598 2.44278 -0.985 0.324718 ## q4_province(3) Nova Scotia -2.21332 2.44652 -0.905 0.365692 ## q4_province(4) New Brunswick -0.97488 2.42932 -0.401 0.688225 ## q4_province(5) Quebec -2.10565 1.95436 -1.077 0.281365 ## q4_province(6) Ontario 0.87815 1.94424 0.452 0.651533 ## q4_province(7) Manitoba -3.53888 2.32003 -1.525 0.127253 ## q4_province(8) Saskatchewan -8.12828 2.32169 -3.501 0.000469 *** ## q4_province(9) Alberta -10.46679 2.27344 -4.604 4.28e-06 *** ## q4_province(10) British Columbia -1.26361 1.95301 -0.647 0.517669 ## --- ## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 ## ## Residual standard error: 23.57 on 3741 degrees of freedom ## Multiple R-squared: 0.3549, Adjusted R-squared: 0.3528 ## F-statistic: 171.5 on 12 and 3741 DF, p-value: < 2.2e-16
library(emmeans)
emmeans(m,specs = "q32_policies_fed_gov_ecn_numeric",
at=list(q32_policies_fed_gov_ecn_numeric=c(1,2,3)))
## q32_policies_fed_gov_ecn_numeric emmean SE df lower.CL upper.CL ## 1 25.3 0.690 3741 24.0 26.7 ## 2 48.8 0.463 3741 47.9 49.8 ## 3 72.3 0.794 3741 70.8 73.9 ## ## Results are averaged over the levels of: q3_gender, q4_province ## Confidence level used: 0.95
library(emmeans)
emmeans(m,specs = "q32_policies_fed_gov_ecn_numeric",
at=list(q32_policies_fed_gov_ecn_numeric=c(1,3))) %>%
contrast( method = "pairwise", infer=TRUE)
## contrast estimate SE df lower.CL upper.CL t.ratio p.value ## 1 - 3 -47 1.16 3741 -49.3 -44.7 -40.345 <.0001 ## ## Results are averaged over the levels of: q3_gender, q4_province ## Confidence level used: 0.95
to_model$prediction <- predict(m)
myrmse <- round(sqrt(mean((to_model$q14_feeling_liberal_party-
to_model$prediction)^2)),1)
ggplot(to_model,aes(x=q14_feeling_liberal_party,y=prediction)) +
geom_jitter() +
labs(title=paste0("predictions and true values plotted: rmse=",myrmse))
58 cases from the SCC in 2021. Can be accesses through link below. Can also be downloaded as a single zip.
library(tidyverse)
library(textreadr)
library(ldatuning)
library(topicmodels)
# https://decisions.scc-csc.ca/scc-csc/scc-csc/en/2021/nav_date.do
download.file("justinsavoie.com/data/2021.zip","~/Downloads/2021.zip")
setwd("..where the unzipped data is..")
folder_2021 <- "SCC/2021/"
See https://www.tidytextmining.com/topicmodeling.html for more info on tidy text analysis.
paths <- paste0(folder_2021,list.files(folder_2021))
my_list <- list()
for (k in 1:length(paths)){
doc <- textreadr::read_docx(paths[k])
citation <- str_replace(doc[2],"Citation: ","")
start <- which(grepl("Present: ",doc))+3
end <- which(grepl("Cases Cited",doc))-1
if (length(start)==0 & length(end)==0){
next
}
doc_sub <- doc[start:end]
df <- tibble(paragraph=paste0(doc_sub,collapse=" "),citation=citation)
library(tidytext)
my_list[[k]] <- df %>%
unnest_tokens(word, paragraph) %>%
mutate(word=str_replace(word,"\\.","")) %>%
filter(!grepl('[0-9]',word)) %>%
filter(nchar(word)>4)
}
df_words <- bind_rows(my_list)
df_words %>% group_by(citation,word) %>% summarise(n=n()) %>% group_by(citation) %>% mutate(pc=n/sum(n)) %>% arrange(citation,desc(pc)) %>% group_by(citation) %>% slice(1:5) %>% ungroup() %>% ggplot(aes(x=word,y=pc)) + geom_bar(stat="identity") + facet_wrap(~citation,scales = "free")
dtm <- cast_dtm(df_words%>%group_by(citation,word)%>%summarise(n=n()),
citation,word,n)
modelLDA <- LDA(dtm,k=3)
wtp <- tidy(modelLDA,matrix="beta")
wtp %>%
arrange(topic,desc(beta)) %>%
group_by(topic) %>%
slice(1:5) %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(y=term,fill=factor(topic),x=beta)) +
geom_bar(stat = "identity",position = "dodge") +
facet_wrap(~topic,ncol=1,scales = "free") + scale_y_reordered()
Discussion of ideal number of topics https://cran.r-project.org/web/packages/ldatuning/vignettes/topics.html
result <- FindTopicsNumber(
dtm,
topics = seq(from = 2, to = 50, by = 1),
metrics = c("Griffiths2004", "CaoJuan2009", "Arun2010", "Deveaud2014"),
method = "Gibbs",
verbose = TRUE
)
FindTopicsNumber_plot(result)
modelLDA <- LDA(dtm,k=9)
wtp <- tidy(modelLDA,matrix="beta")
wtp %>% arrange(topic,desc(beta)) %>% group_by(topic) %>% slice(1:10) %>% mutate(term = reorder_within(term, beta, topic)) %>% ggplot(aes(y=term,x=beta)) + geom_bar(stat = "identity",position = "dodge") + facet_wrap(~topic,scales = "free") + scale_y_reordered()